###############################################################################
# This script loads the SOI data (at the school district level) and merge it to the referendums

###############################################################################

## Get the change in itemizers based on the year 2018 (post-TCJA) and 2017 (pre-TCJA)


##
soi <- read.csv("Data/Raw Data/soi_school_districts.csv")
soi$School_District <- sprintf("%07d",soi$School_District)
soi <- soi[substr(soi$School_District,1,2)=="06",]#Keep California

# Get 2018 and 2017 and create first difference variables
soi18 <- soi[soi$year ==2018,]
soi17 <- soi[soi$year ==2017,]

soi18 <- soi18[,c("School_District","itm_prop_perc","itm_prop_amnt","itm_salt_amnt_capped","itm_salt_amnt","share_pop_missing")]
soi17 <- soi17[,c("School_District","itm_prop_perc","itm_prop_amnt","itm_salt_amnt_capped","itm_salt_amnt","share_pop_missing")]

colnames(soi18)[2:length(soi18)] <- paste(colnames(soi18)[2:length(soi18)],"18",sep = "_")
colnames(soi17)[2:length(soi17)] <- paste(colnames(soi17)[2:length(soi17)],"17",sep = "_")
change <- merge(soi17,soi18)


#########################
## Create the change variables used in the study

# change in share of property tax deducters
change$itm_prop_change <- change$itm_prop_perc_17 - change$itm_prop_perc_18 

# Wassted SALT defined as amount of SALT not deducted because of the cap post-TCJA -- pmax(0,...): deals with errors in reporting. 
change$wasted_salt <- pmax(0,change$itm_salt_amnt_18 - change$itm_salt_amnt_capped_18)
change$wasted_salt_perc <- pmax(0,(change$itm_salt_amnt_18 - change$itm_salt_amnt_capped_18)/change$itm_salt_amnt_18)

# Change in SALT deducted (dollars and % change)
change$salt_amt_change <- (change$itm_salt_amnt_17 - change$itm_salt_amnt_capped_18)
change$salt_amt_change_perc <- (change$itm_salt_amnt_17 - change$itm_salt_amnt_capped_18)/change$itm_salt_amnt_17

# Create dummy variable based on above/below median of change 
change$highpropchange <- ifelse(change$itm_prop_change >= median(change$itm_prop_change),1,0)

change$share_pop_missing <- change$share_pop_missing_17
change$share_pop_missing_17 <- NULL
change$share_pop_missing_18 <- NULL

# Keep variables needed
change <- change[,c("School_District","itm_prop_perc_17","itm_prop_change","salt_amt_change_perc","wasted_salt","wasted_salt_perc","salt_amt_change","highpropchange","share_pop_missing")]


# Merge with referendums.
df <- merge(df,change,by.x = "LEAID",by.y = "School_District",all.x = TRUE)

# remove temporary files
rm(soi17,soi18,soi,change)

########################################################
# Get the annual cost of public goods net of deductions from SOI data

########################################################

## Load school district SOI data
soi <- read.csv("Data/Raw data/soi_school_districts.csv")

## Create 7-digits ID
soi$School_District <- sprintf("%07d",soi$School_District)

## Keep only California school districts
soi <- soi[substr(soi$School_District,1,2)=="06",]
soi <- soi[,c("year","School_District","itm_perc","itm_prop_perc","nb","share_land_missing","share_pop_missing","avg_taxrate","avg_costLPG","avg_taxrate_weighted_AGI","avg_costLPG_weighted_AGI")]


## For missing year, use the closest year for which we have data
soi20 <- soi[soi$year ==2020,]
soi21 <- soi20
soi21$year <- 2021
soi22 <- soi20
soi22$year <- 2022
soi09 <- soi[soi$year ==2009,]
soi08 <- soi09
soi08$year <- 2008
soi07 <- soi09
soi07$year <- 2007

soi <- rbind(soi,soi21,soi22,soi08,soi07)
rm(soi07,soi08,soi09,soi20,soi21,soi22)


# Keep variable used in analysis
soi <- soi[,c("year","School_District","avg_costLPG","avg_costLPG_weighted_AGI")]

# For NA values in the weighted cost for LPG, use the non-weighted version
soi$avg_costLPG_weighted_AGI <- ifelse(is.na(soi$avg_costLPG_weighted_AGI),soi$avg_costLPG,soi$avg_costLPG_weighted_AGI)
soi$avg_costLPG <- NULL
# Merge with referendums such that referendums in 2020 are matched with SOI of 2019 given tax-filing lags.
soi$year <- soi$year +1
df <- merge(df,soi,by.x = c("LEAID","year"),by.y = c("School_District","year"))
rm(soi)


# Remove school districts for which the SOI data does not cover 50% of the population
df <- df[!df$share_pop_missing>0.5,]
df$share_pop_missing <- NULL